In the previous step we gather all known cancer hotspots by scavenging calls by - checking for overlap with amino acid positions in a curated and published cancer hotspot database - checking for overlap with non-coding region hotspots mutations , here we are using the in TERT promoter region

In this notebook, we will create consensus mafs from the filtered call scratch/hotspot-detection/*RDS. Since there are slight differences in read support from each caller, we will take a mean value to provide the information per site in the consensus maf file. All other vcf2maf columns will be added from strelka/mutect2

Setup

library("tidyverse")
library("maftools")

root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")

results_dir <- file.path(root_dir,
                         "analyses",
                         "hotspots-detection",
                         "results")
# Make output folder
if (!dir.exists(results_dir )) {
  dir.create(results_dir , recursive = TRUE)
}

scratch_dir <- file.path(
  root_dir,
  "scratch",
  "hotspot-detection"
)

# combined hotspot calls
combined_hotspot_calls <- list.files(path = scratch_dir,pattern = ".RDS",full.names = TRUE) %>%
  map(readRDS) %>% 
  bind_rows()

# hotspot mutation maf file output
output_file <- file.path(results_dir,"pbta-snv-hotspots-mutation.maf.tsv.gz")


# PBTA consensus maf 
maf_coltypes <- readRDS(file.path("input","maf_coltypes.RDS"))
consensus_maf_file <- read_tsv(file.path(data_dir,"pbta-snv-consensus-mutation.maf.tsv.gz"),
                               col_types = readr::cols(Entrez_Gene_Id = readr::col_character(),
                                                       ALLELE_NUM = readr::col_character(),
                                                       vcf_pos = readr::col_character(),
                                                       vcf_qual = readr::col_number()))
1270622 parsing failures.
row      col expected actual                                                                 file
  1 vcf_qual a number      . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
  2 vcf_qual a number      . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
  3 vcf_qual a number      . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
  4 vcf_qual a number      . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
  5 vcf_qual a number      . '/home/rstudio/OpenPBTA/data/pbta-snv-consensus-mutation.maf.tsv.gz'
... ........ ........ ...... ....................................................................
See problems(...) for more details.

Gather consensus maf file for hotspots

The main tasks are: - calculate a mean of read supports for REF and ALT from multiple callers which can be slightly different for the consenus maf file - create a maf format output to be appended on to the 3/3 consensus maf file


unique_cols <- c("t_depth",
                 "n_depth",
                 "t_ref_count",
                 "n_ref_count",
                 "t_alt_count",
                 "n_alt_count",
                 "caller",
                 "vcf_qual")


combined_hotspot_calls <- combined_hotspot_calls %>%
  unique() %>%
  group_by_at(setdiff(names(combined_hotspot_calls), unique_cols)) %>%
  summarise(
            t_depth= mean(as.numeric(t_depth)),
            n_depth = mean(as.numeric(n_depth)),
            t_ref_count = mean(as.numeric(t_ref_count)),
            n_ref_count = mean(as.numeric(n_ref_count)),
            t_alt_count = mean(as.numeric(t_alt_count)),
            n_alt_count = mean(as.numeric(n_alt_count)),
            caller_count = n(),
            caller = toString(caller),
            vcf_qual = mean(as.numeric(vcf_qual))) %>% 
  ungroup() %>%
  mutate(VAF=t_alt_count/(t_ref_count+t_alt_count))

Mean read support for combination of callers

We will try to identify the read support for each combination of callers to visualize if any callers call hostspot sites with low read support calls which


ggplot(combined_hotspot_calls[,c("caller","caller_count","t_alt_count")],
       aes(x=as.factor(caller_count),y=t_alt_count))+
  geom_violin()+ 
  ggpubr::stat_compare_means()

Hotspot sites called by just 1 caller seem to have majority low read support. Lets look at these calls that are unique to 1 caller.


unique_1caller_hotspots<- combined_hotspot_calls %>%
  filter(caller_count==1)

unique_1caller_hotspots

What’s the distrbution of the t_alt_count


summary.caller <- unique_1caller_hotspots %>%  
  group_by(caller) %>%
  tally()


ggplot(unique_1caller_hotspots[,c("caller","t_alt_count")],
       aes(x=as.factor(caller),y=t_alt_count))+
  geom_violin()+ 
  geom_jitter(height = 0, width = 0.1) +
  # add number of calls at the top
  geom_text(data=summary.caller ,aes(x = caller, y = 350, label=n), fontface =2, size = 4)+
  # draw a line where t_alt_count==5
  geom_hline(aes(yintercept=5,color="red"))+
  theme(legend.position="none")

NA
NA

Strelka has majority of the unique calls with a wide distribution of t_alt_count. Some calls from lancet,mutect2 are below the red line which denotes t_alt_count==5.

Merge the combined hotspots with consensus

We will first filter combined_hotspot_calls to keep only rows that are not found in consensus_maf_file


check_with_consensus<- consensus_maf_file %>%
  mutate("consensus"="Yes") %>%
  select(-one_of(unique_cols),-VAF,-vcf_pos)
Unknown columns: `caller`
combined_hotspot_calls %>%
  select(-caller_count,-caller,-Amino_Acid_Position,-unique_cols,-VAF) %>%
  mutate(hotspot="Yes") %>%
  left_join(check_with_consensus) %>%
  filter(is.na(consensus))
Joining, by = c("Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome", "Start_Position", "End_Position", "Strand", "Variant_Classification", "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode", "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2", "Tumor_Validation_Allele1", "Tumor_Validation_Allele2", "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2", "Verification_Status", "Validation_Status", "Mutation_Status", "Sequencing_Phase", "Sequence_Source", "Validation_Method", "Score", "BAM_File", "Sequencer", "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID", "HGVSc", "HGVSp", "HGVSp_Short", "Transcript_ID", "Exon_Number", "all_effects", "Allele", "Gene", "Feature", "Feature_type", "Consequence", "cDNA_position", "CDS_position", "Protein_position", "Amino_acids", "Codons", "Existing_variation", "ALLELE_NUM", "DISTANCE", "STRAND_VEP", "SYMBOL", "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "RefSeq", "SIFT", "PolyPhen", "EXON", "INTRON", "DOMAINS", "AF", "AFR_AF", "AMR_AF", "ASN_AF", "EAS_AF", "EUR_AF", "SAS_AF", "AA_AF", "EA_AF", "CLIN_SIG", "SOMATIC", "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT", "PICK", "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "MINIMISED", "ExAC_AF", "ExAC_AF_AFR", "ExAC_AF_AMR", "ExAC_AF_EAS", "ExAC_AF_FIN", "ExAC_AF_NFE", "ExAC_AF_OTH", "ExAC_AF_SAS", "GENE_PHENO", "FILTER", "flanking_bps", "vcf_id", "ExAC_AF_Adj", "ExAC_AC_AN_Adj", "ExAC_AC_AN", "ExAC_AC_AN_AFR", "ExAC_AC_AN_AMR", "ExAC_AC_AN_EAS", "ExAC_AC_AN_FIN", "ExAC_AC_AN_NFE", "ExAC_AC_AN_OTH", "ExAC_AC_AN_SAS", "ExAC_FILTER", "gnomAD_AF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_ASJ_AF", "gnomAD_EAS_AF", "gnomAD_FIN_AF", "gnomAD_NFE_AF", "gnomAD_OTH_AF", "gnomAD_SAS_AF")
LS0tCnRpdGxlOiAiQ3JlYXRlIGNvbnNlbnN1cyBob3RzcG90IG1hZiIKYXV0aG9yOiBLcnV0aWthIEdhb25rYXIgZm9yIEQzYgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKCi0tLQoKSW4gdGhlIHByZXZpb3VzIHN0ZXAgd2UgZ2F0aGVyIGFsbCBrbm93biBjYW5jZXIgaG90c3BvdHMgYnkgc2NhdmVuZ2luZyBjYWxscyBieQotIGNoZWNraW5nIGZvciBvdmVybGFwIHdpdGggYW1pbm8gYWNpZCBwb3NpdGlvbnMgaW4gYSBjdXJhdGVkIGFuZCBwdWJsaXNoZWQgY2FuY2VyIGhvdHNwb3QgW2RhdGFiYXNlXShodHRwczovL3d3dy5jYW5jZXJob3RzcG90cy5vcmcvZmlsZXMvaG90c3BvdHNfdjIueGxzKQotIGNoZWNraW5nIGZvciBvdmVybGFwIHdpdGggbm9uLWNvZGluZyByZWdpb24gaG90c3BvdHMgbXV0YXRpb25zICwgaGVyZSB3ZSBhcmUgdXNpbmcgdGhlIGluIFRFUlQgcHJvbW90ZXIgcmVnaW9uCgpJbiB0aGlzIG5vdGVib29rLCB3ZSB3aWxsIGNyZWF0ZSBjb25zZW5zdXMgbWFmcyBmcm9tIHRoZSBmaWx0ZXJlZCBjYWxsIGBzY3JhdGNoL2hvdHNwb3QtZGV0ZWN0aW9uLypSRFNgLiBTaW5jZSB0aGVyZSBhcmUgc2xpZ2h0IGRpZmZlcmVuY2VzIGluIHJlYWQgc3VwcG9ydCBmcm9tIGVhY2ggY2FsbGVyLCB3ZSB3aWxsIHRha2UgYSBtZWFuIHZhbHVlIHRvIHByb3ZpZGUgdGhlIGluZm9ybWF0aW9uIHBlciBzaXRlIGluIHRoZSBjb25zZW5zdXMgbWFmIGZpbGUuIEFsbCBvdGhlciB2Y2YybWFmIGNvbHVtbnMgd2lsbCBiZSBhZGRlZCBmcm9tIHN0cmVsa2EvbXV0ZWN0MiAgCgojIyBTZXR1cApgYGB7cn0KbGlicmFyeSgidGlkeXZlcnNlIikKbGlicmFyeSgibWFmdG9vbHMiKQoKcm9vdF9kaXIgPC0gcnByb2pyb290OjpmaW5kX3Jvb3QocnByb2pyb290OjpoYXNfZGlyKCIuZ2l0IikpCmRhdGFfZGlyIDwtIGZpbGUucGF0aChyb290X2RpciwgImRhdGEiKQoKcmVzdWx0c19kaXIgPC0gZmlsZS5wYXRoKHJvb3RfZGlyLAogICAgICAgICAgICAgICAgICAgICAgICAgImFuYWx5c2VzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICJob3RzcG90cy1kZXRlY3Rpb24iLAogICAgICAgICAgICAgICAgICAgICAgICAgInJlc3VsdHMiKQojIE1ha2Ugb3V0cHV0IGZvbGRlcgppZiAoIWRpci5leGlzdHMocmVzdWx0c19kaXIgKSkgewogIGRpci5jcmVhdGUocmVzdWx0c19kaXIgLCByZWN1cnNpdmUgPSBUUlVFKQp9CgpzY3JhdGNoX2RpciA8LSBmaWxlLnBhdGgoCiAgcm9vdF9kaXIsCiAgInNjcmF0Y2giLAogICJob3RzcG90LWRldGVjdGlvbiIKKQoKIyBjb21iaW5lZCBob3RzcG90IGNhbGxzCmNvbWJpbmVkX2hvdHNwb3RfY2FsbHMgPC0gbGlzdC5maWxlcyhwYXRoID0gc2NyYXRjaF9kaXIscGF0dGVybiA9ICIuUkRTIixmdWxsLm5hbWVzID0gVFJVRSkgJT4lCiAgbWFwKHJlYWRSRFMpICU+JSAKICBiaW5kX3Jvd3MoKQoKIyBob3RzcG90IG11dGF0aW9uIG1hZiBmaWxlIG91dHB1dApvdXRwdXRfZmlsZSA8LSBmaWxlLnBhdGgocmVzdWx0c19kaXIsInBidGEtc252LWhvdHNwb3RzLW11dGF0aW9uLm1hZi50c3YuZ3oiKQoKCiMgUEJUQSBjb25zZW5zdXMgbWFmIAptYWZfY29sdHlwZXMgPC0gcmVhZFJEUyhmaWxlLnBhdGgoImlucHV0IiwibWFmX2NvbHR5cGVzLlJEUyIpKQpjb25zZW5zdXNfbWFmX2ZpbGUgPC0gcmVhZF90c3YoZmlsZS5wYXRoKGRhdGFfZGlyLCJwYnRhLXNudi1jb25zZW5zdXMtbXV0YXRpb24ubWFmLnRzdi5neiIpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY29sX3R5cGVzID0gcmVhZHI6OmNvbHMoRW50cmV6X0dlbmVfSWQgPSByZWFkcjo6Y29sX2NoYXJhY3RlcigpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgQUxMRUxFX05VTSA9IHJlYWRyOjpjb2xfY2hhcmFjdGVyKCksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB2Y2ZfcG9zID0gcmVhZHI6OmNvbF9jaGFyYWN0ZXIoKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHZjZl9xdWFsID0gcmVhZHI6OmNvbF9udW1iZXIoKSkpCgpgYGAKCiMjIEdhdGhlciBjb25zZW5zdXMgbWFmIGZpbGUgZm9yIGhvdHNwb3RzCgpUaGUgbWFpbiB0YXNrcyBhcmU6CiAtIGNhbGN1bGF0ZSBhIG1lYW4gb2YgcmVhZCBzdXBwb3J0cyBmb3IgUkVGIGFuZCBBTFQgZnJvbSBtdWx0aXBsZSBjYWxsZXJzIHdoaWNoIGNhbiBiZSBzbGlnaHRseSBkaWZmZXJlbnQgZm9yIHRoZSBjb25zZW51cyBtYWYgZmlsZQogLSBjcmVhdGUgYSBtYWYgZm9ybWF0IG91dHB1dCB0byBiZSBhcHBlbmRlZCBvbiB0byB0aGUgMy8zIGNvbnNlbnN1cyBtYWYgZmlsZQoKYGBge3Igd2FybmluZz1GQUxTRSB9Cgp1bmlxdWVfY29scyA8LSBjKCJ0X2RlcHRoIiwKICAgICAgICAgICAgICAgICAibl9kZXB0aCIsCiAgICAgICAgICAgICAgICAgInRfcmVmX2NvdW50IiwKICAgICAgICAgICAgICAgICAibl9yZWZfY291bnQiLAogICAgICAgICAgICAgICAgICJ0X2FsdF9jb3VudCIsCiAgICAgICAgICAgICAgICAgIm5fYWx0X2NvdW50IiwKICAgICAgICAgICAgICAgICAiY2FsbGVyIiwKICAgICAgICAgICAgICAgICAidmNmX3F1YWwiKQoKCmNvbWJpbmVkX2hvdHNwb3RfY2FsbHMgPC0gY29tYmluZWRfaG90c3BvdF9jYWxscyAlPiUKICB1bmlxdWUoKSAlPiUKICBncm91cF9ieV9hdChzZXRkaWZmKG5hbWVzKGNvbWJpbmVkX2hvdHNwb3RfY2FsbHMpLCB1bmlxdWVfY29scykpICU+JQogIHN1bW1hcmlzZSgKICAgICAgICAgICAgdF9kZXB0aD0gbWVhbihhcy5udW1lcmljKHRfZGVwdGgpKSwKICAgICAgICAgICAgbl9kZXB0aCA9IG1lYW4oYXMubnVtZXJpYyhuX2RlcHRoKSksCiAgICAgICAgICAgIHRfcmVmX2NvdW50ID0gbWVhbihhcy5udW1lcmljKHRfcmVmX2NvdW50KSksCiAgICAgICAgICAgIG5fcmVmX2NvdW50ID0gbWVhbihhcy5udW1lcmljKG5fcmVmX2NvdW50KSksCiAgICAgICAgICAgIHRfYWx0X2NvdW50ID0gbWVhbihhcy5udW1lcmljKHRfYWx0X2NvdW50KSksCiAgICAgICAgICAgIG5fYWx0X2NvdW50ID0gbWVhbihhcy5udW1lcmljKG5fYWx0X2NvdW50KSksCiAgICAgICAgICAgIGNhbGxlcl9jb3VudCA9IG4oKSwKICAgICAgICAgICAgY2FsbGVyID0gdG9TdHJpbmcoY2FsbGVyKSwKICAgICAgICAgICAgdmNmX3F1YWwgPSBtZWFuKGFzLm51bWVyaWModmNmX3F1YWwpKSkgJT4lIAogIHVuZ3JvdXAoKSAlPiUKICBtdXRhdGUoVkFGPXRfYWx0X2NvdW50Lyh0X3JlZl9jb3VudCt0X2FsdF9jb3VudCkpCgpgYGAKCiMjIE1lYW4gcmVhZCBzdXBwb3J0IGZvciBjb21iaW5hdGlvbiBvZiBjYWxsZXJzCgpXZSB3aWxsIHRyeSB0byBpZGVudGlmeSB0aGUgcmVhZCBzdXBwb3J0IGZvciBlYWNoIGNvbWJpbmF0aW9uIG9mIGNhbGxlcnMgdG8gdmlzdWFsaXplIGlmIGFueSBjYWxsZXJzIGNhbGwgaG9zdHNwb3Qgc2l0ZXMgd2l0aCBsb3cgcmVhZCBzdXBwb3J0IGNhbGxzIHdoaWNoCmBgYHtyfQoKZ2dwbG90KGNvbWJpbmVkX2hvdHNwb3RfY2FsbHNbLGMoImNhbGxlciIsImNhbGxlcl9jb3VudCIsInRfYWx0X2NvdW50IildLAogICAgICAgYWVzKHg9YXMuZmFjdG9yKGNhbGxlcl9jb3VudCkseT10X2FsdF9jb3VudCkpKwogIGdlb21fdmlvbGluKCkrIAogIGdncHVicjo6c3RhdF9jb21wYXJlX21lYW5zKCkKCmBgYApIb3RzcG90IHNpdGVzIGNhbGxlZCBieSBqdXN0IDEgY2FsbGVyIHNlZW0gdG8gaGF2ZSBtYWpvcml0eSBsb3cgcmVhZCBzdXBwb3J0LiBMZXRzIGxvb2sgYXQgdGhlc2UgY2FsbHMgdGhhdCBhcmUgdW5pcXVlIHRvIDEgY2FsbGVyLgoKYGBge3J9Cgp1bmlxdWVfMWNhbGxlcl9ob3RzcG90czwtIGNvbWJpbmVkX2hvdHNwb3RfY2FsbHMgJT4lCiAgZmlsdGVyKGNhbGxlcl9jb3VudD09MSkKCnVuaXF1ZV8xY2FsbGVyX2hvdHNwb3RzCmBgYAoKV2hhdCdzIHRoZSBkaXN0cmJ1dGlvbiBvZiB0aGUgdF9hbHRfY291bnQKYGBge3J9CgpzdW1tYXJ5LmNhbGxlciA8LSB1bmlxdWVfMWNhbGxlcl9ob3RzcG90cyAlPiUgIAogIGdyb3VwX2J5KGNhbGxlcikgJT4lCiAgdGFsbHkoKQoKCmdncGxvdCh1bmlxdWVfMWNhbGxlcl9ob3RzcG90c1ssYygiY2FsbGVyIiwidF9hbHRfY291bnQiKV0sCiAgICAgICBhZXMoeD1hcy5mYWN0b3IoY2FsbGVyKSx5PXRfYWx0X2NvdW50KSkrCiAgZ2VvbV92aW9saW4oKSsgCiAgZ2VvbV9qaXR0ZXIoaGVpZ2h0ID0gMCwgd2lkdGggPSAwLjEpICsKICAjIGFkZCBudW1iZXIgb2YgY2FsbHMgYXQgdGhlIHRvcAogIGdlb21fdGV4dChkYXRhPXN1bW1hcnkuY2FsbGVyICxhZXMoeCA9IGNhbGxlciwgeSA9IDM1MCwgbGFiZWw9biksIGZvbnRmYWNlID0yLCBzaXplID0gNCkrCiAgIyBkcmF3IGEgbGluZSB3aGVyZSB0X2FsdF9jb3VudD09NQogIGdlb21faGxpbmUoYWVzKHlpbnRlcmNlcHQ9NSxjb2xvcj0icmVkIikpKwogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbj0ibm9uZSIpCgoKYGBgCgpTdHJlbGthIGhhcyBtYWpvcml0eSBvZiB0aGUgdW5pcXVlIGNhbGxzIHdpdGggYSB3aWRlIGRpc3RyaWJ1dGlvbiBvZiB0X2FsdF9jb3VudC4gU29tZSBjYWxscyBmcm9tIGxhbmNldCxtdXRlY3QyIGFyZSBiZWxvdyB0aGUgcmVkIGxpbmUgd2hpY2ggZGVub3RlcyB0X2FsdF9jb3VudD09NS4KCiMjIE1lcmdlIHRoZSBjb21iaW5lZCBob3RzcG90cyB3aXRoIGNvbnNlbnN1cyAKCldlIHdpbGwgZmlyc3QgZmlsdGVyIGBjb21iaW5lZF9ob3RzcG90X2NhbGxzYCB0byBrZWVwIG9ubHkgcm93cyB0aGF0IGFyZSBub3QgZm91bmQgaW4gYGNvbnNlbnN1c19tYWZfZmlsZWAKYGBge3J9CgpjaGVja193aXRoX2NvbnNlbnN1czwtIGNvbnNlbnN1c19tYWZfZmlsZSAlPiUKICBtdXRhdGUoImNvbnNlbnN1cyI9IlllcyIpICU+JQogIHNlbGVjdCgtb25lX29mKHVuaXF1ZV9jb2xzKSwtVkFGLC12Y2ZfcG9zKQoKY29tYmluZWRfaG90c3BvdF9jYWxscyAlPiUKICBzZWxlY3QoLWNhbGxlcl9jb3VudCwtY2FsbGVyLC1BbWlub19BY2lkX1Bvc2l0aW9uLC11bmlxdWVfY29scywtVkFGLC12Y2ZfcG9zKSAlPiUKICBtdXRhdGUoaG90c3BvdD0iWWVzIikgJT4lCiAgbGVmdF9qb2luKGNoZWNrX3dpdGhfY29uc2Vuc3VzKSAlPiUKICBmaWx0ZXIoaXMubmEoY29uc2Vuc3VzKSkKCgpgYGAKCgo=